This notebook compares doublet results calculated on benchmarking datasets to one another, with the primary goal of addressing these questions:
There are several items to bear in mind when interpreting these results:
cxds, as we are using it, does not have a specific
threshold for calling droplets. By contrast, both scrublet
and scDblFinder identify a threshold based on the given
dataset they are processing. This notebook used a doublet threshold of
>=0.75 for cxds, which may not be
universally suitable (though choosing a universally suitable threshold
is not easy either!).suppressPackageStartupMessages({
library(SingleCellExperiment)
library(ggplot2)
library(patchwork)
library(caret)
library(UpSetR)
})
# Use a consistent minimal theme for all plots in this notebook
theme_set(theme_bw())
# Locate the project root (the directory holding the renv project) so paths
# below do not depend on the current working directory
module_base <- rprojroot::find_root(rprojroot::is_renv_project)
# input benchmarking datasets (SCE objects) and per-method doublet results
data_dir <- file.path(module_base, "scratch", "benchmark-datasets")
result_dir <- file.path(module_base, "results", "benchmark-results")
plot_pca_calls <- function(df,
                           color_column,
                           pred_column = NULL,
                           color_lab) {
  # Plot the first two PCs colored by singlet/doublet call, re-drawing the
  # "doublet" points on top so they are not hidden under the (far more
  # numerous) singlet points.
  #
  # df           data frame with columns PC1, PC2, and the column referenced
  #              by `color_column`
  # color_column unquoted column name holding "doublet"/"singlet" calls
  # pred_column  unused by this function; retained with a NULL default for
  #              backward compatibility with any existing callers
  # color_lab    legend title for the color scale
  #
  # Returns a ggplot object.
  ggplot(df) +
    aes(x = PC1,
        y = PC2,
        color = {{color_column}}) +
    geom_point(
      size = 0.75,
      alpha = 0.6
    ) +
    # "doublet" sorts before "singlet", so doublets map to black
    scale_color_manual(name = color_lab, values = c("black", "lightblue")) +
    # overplot the doublets so they sit above the singlet layer
    geom_point(
      data = dplyr::filter(df, {{color_column}} == "doublet"),
      color = "black",
      size = 0.75
    ) +
    theme(
      legend.title.position = "top",
      legend.position = "bottom"
    )
}
plot_pca_metrics <- function(df, color_column, false_colors) {
  # Plot the first two PCs colored directly by per-cell metric colors,
  # re-drawing the false-call points on top of everything else.
  #
  # df           data frame with columns PC1, PC2, and the column referenced
  #              by `color_column` (unquoted); the column holds color strings
  # false_colors vector of the colors used for fn and fp points, which are
  #              overplotted so they remain visible
  #
  # Returns a ggplot object (legend suppressed; colors are used as-is).
  false_points <- dplyr::filter(df, {{color_column}} %in% false_colors)
  base_layer <- geom_point(size = 0.75, alpha = 0.6)
  highlight_layer <- geom_point(data = false_points, size = 0.75)
  ggplot(df, aes(x = PC1, y = PC2, color = {{color_column}})) +
    base_layer +
    highlight_layer +
    scale_color_identity() +
    theme(legend.position = "none")
}
First, we’ll read in and combine doublet results into a list of data frames for each dataset. We’ll also create two new columns for each dataset:
- consensus_call, which will be “doublet” if all
methods predict “doublet,” and “singlet” otherwise
- call_type, which will classify the consensus call as
one of “tp”, “tn”, “fp”, or “fn” (true/false positive/negative)
# find all dataset names to process:
# Note: the pattern arguments to list.files() and str_remove() are regular
# expressions, not globs, so the "." must be escaped and the match anchored
# to the end of the filename ("*_scrublet.tsv" was a glob mistakenly used
# as a regex)
dataset_names <- list.files(result_dir, pattern = "_scrublet\\.tsv$") |>
  stringr::str_remove("_scrublet\\.tsv$")
# colors for each confusion-matrix call type, used in the PCA plots below
confusion_colors <- c(
  "tp" = "lightblue",
  "tn" = "pink",
  "fp" = "blue",
  "fn" = "firebrick2"
)
# Read in and combine data for analysis: one data frame per dataset with
# per-method scores/predictions, ground truth, and PCA coordinates
doublet_df_list <- dataset_names |>
  purrr::map(
    \(dataset) {
      # per-method result files and the SCE holding ground truth + PCA
      scdbl_tsv <- file.path(result_dir, glue::glue("{dataset}_scdblfinder.tsv"))
      scrub_tsv <- file.path(result_dir, glue::glue("{dataset}_scrublet.tsv"))
      sce_file <- file.path(data_dir, dataset, glue::glue("{dataset}_sce.rds"))
      scdbl_df <- scdbl_tsv |>
        readr::read_tsv(show_col_types = FALSE) |>
        dplyr::select(
          barcodes,
          cxds_score,
          scdbl_score = score,
          scdbl_prediction = class
        ) |>
        # add cxds calls at the 0.75 threshold, since cxds itself does not
        # provide a calling threshold
        dplyr::mutate(
          cxds_prediction = dplyr::if_else(
            cxds_score >= 0.75,
            "doublet",
            "singlet"
          )
        )
      scrub_df <- readr::read_tsv(scrub_tsv, show_col_types = FALSE)
      # grab ground truth and PCA coordinates
      sce <- readr::read_rds(sce_file)
      sce_df <- scuttle::makePerCellDF(sce, use.dimred = "PCA") |>
        tibble::rownames_to_column(var = "barcodes") |>
        dplyr::select(barcodes,
                      ground_truth = ground_truth_doublets,
                      PC1 = PCA.1,
                      PC2 = PCA.2)
      rm(sce)
      dataset_df <- scdbl_df |>
        dplyr::left_join(
          scrub_df,
          by = "barcodes"
        ) |>
        dplyr::left_join(
          sce_df,
          by = "barcodes"
        )
      # Add consensus call and evaluation columns.
      # These comparisons are all vectorized, so no rowwise() is needed;
      # the previous rowwise() approach was also never ungroup()ed, which
      # left the returned data frames rowwise-grouped for downstream code.
      dataset_df |>
        dplyr::mutate(
          # "doublet" only when all three methods agree
          consensus_call = dplyr::if_else(
            scdbl_prediction == "doublet" &
              scrublet_prediction == "doublet" &
              cxds_prediction == "doublet",
            "doublet",
            "singlet"
          ),
          # classify the consensus call against ground truth
          call_type = dplyr::case_when(
            consensus_call == "doublet" & ground_truth == "doublet" ~ "tp",
            consensus_call == "singlet" & ground_truth == "singlet" ~ "tn",
            consensus_call == "doublet" & ground_truth == "singlet" ~ "fp",
            consensus_call == "singlet" & ground_truth == "doublet" ~ "fn"
          ),
          # set associated plotting colors via direct named-vector lookup
          call_type_color = unname(confusion_colors[call_type])
        )
    }
  )
names(doublet_df_list) <- dataset_names
This section contains upset plots that show overlap across doublet calls from each method, displayed for each dataset.
pull_barcodes <- function(df, pred_var) {
  # Return the barcodes of droplets that a given method called as doublets.
  # `pred_var` is the (string) name of the prediction column to inspect.
  is_doublet <- df[[pred_var]] == "doublet"
  df[["barcodes"]][is_doublet]
}
# Draw an upset plot of per-method doublet calls for each dataset.
# NOTE(review): purrr::iwalk() returns its input invisibly, so `upset_list`
# is just `doublet_df_list` under another name; the plots are side effects.
upset_list <- doublet_df_list |>
  purrr::iwalk(
    \(df, dataset) {
      # doublet barcodes called by each individual method
      doublet_barcodes <- list(
        "scDblFinder" = pull_barcodes(df, "scdbl_prediction"),
        "scrublet" = pull_barcodes(df, "scrublet_prediction"),
        "cxds" = pull_barcodes(df, "cxds_prediction")
      )
      # upset objects must be explicitly printed inside a walk to render
      UpSetR::upset(fromList(doublet_barcodes), order.by = "freq") |> print()
      grid::grid.text( # plot title, drawn over the upset plot
        dataset,
        x = 0.65,
        y = 0.95,
        gp = grid::gpar(fontsize=16)
      )
    }
  )
This section visualizes and evaluates the consensus doublet calls across each dataset.
This section plots the PCA for each dataset, with three color schemes from left to right: ground truth, consensus call, and call type, shown as true positive (tp), true negative (tn),
false positive (fp), or false negative (fn).
# Make a legend for the confusion-colored PCA
# Build a dummy plot whose only purpose is to supply a shared legend
# mapping call types (tp/tn/fp/fn) to their plotting colors
call_types <- factor(names(confusion_colors), levels = names(confusion_colors))
legend_df <- data.frame(x = call_types, y = seq_along(call_types))
legend_plot <- ggplot(legend_df, aes(x = x, y = y, color = x)) +
  geom_point(size = 3) +
  scale_color_manual(name = "Metric", values = confusion_colors)
# extract only the legend as a standalone ggplot for use in patchwork layouts
confusion_legend <- ggpubr::as_ggplot(ggpubr::get_legend(legend_plot))
# For each dataset, draw three PCA panels side by side:
# ground truth, consensus call, and confusion call type
doublet_df_list |>
  purrr::iwalk(
    \(df, dataset) {
      # First, ground truth
      truth_panel <- plot_pca_calls(
        df,
        color_column = ground_truth,
        color_lab = "Ground truth"
      )
      # Second, consensus call
      consensus_panel <- plot_pca_calls(
        df,
        color_column = consensus_call,
        color_lab = "Consensus call"
      )
      # Third, call type, with false calls drawn on top
      call_type_panel <- plot_pca_metrics(
        df,
        call_type_color,
        false_colors = unname(c(confusion_colors["fn"], confusion_colors["fp"]))
      )
      # combine panels with the shared legend and print
      combined <- truth_panel + consensus_panel + call_type_panel +
        confusion_legend +
        plot_annotation(glue::glue("PCA for {dataset}")) +
        plot_layout(ncol = 4, widths = c(1, 1, 1, 0.25))
      plot(combined)
    }
  )
This section calculates a confusion matrix and associated statistics on the consensus calls.
# Calculate a confusion matrix and summary statistics for the consensus
# calls on each dataset, collecting per-dataset metrics into one table
metric_df <- doublet_df_list |>
  purrr::imap(
    \(df, dataset) {
      print(glue::glue("======================== {dataset} ========================"))
      cat("Table of consensus calls:")
      print(table(df$consensus_call))
      cat("\n\n")
      confusion_result <- caret::confusionMatrix(
        # For the table method of caret::confusionMatrix(), the FIRST
        # dimension must be the prediction and the SECOND the reference
        # (truth). With truth given first (as before), the reported
        # sensitivity/specificity are silently swapped with PPV/NPV.
        table(
          "Consensus prediction" = df$consensus_call,
          "Truth" = df$ground_truth
        ),
        positive = "doublet"
      )
      print(confusion_result)
      # Extract information we want to present later in a table
      tibble::tibble(
        "Dataset name" = dataset,
        "Kappa" = round(confusion_result$overall["Kappa"], 3),
        "Balanced accuracy" = round(confusion_result$byClass["Balanced Accuracy"], 3)
      )
    }
  ) |>
  dplyr::bind_rows()
======================== hm-6k ========================
Table of consensus calls:
doublet singlet
62 6744
Confusion Matrix and Statistics
Consensus prediction
Truth doublet singlet
doublet 62 109
singlet 0 6635
Accuracy : 0.984
95% CI : (0.9807, 0.9868)
No Information Rate : 0.9909
P-Value [Acc > NIR] : 1
Kappa : 0.5258
Mcnemar's Test P-Value : <2e-16
Sensitivity : 1.00000
Specificity : 0.98384
Pos Pred Value : 0.36257
Neg Pred Value : 1.00000
Prevalence : 0.00911
Detection Rate : 0.00911
Detection Prevalence : 0.02512
Balanced Accuracy : 0.99192
'Positive' Class : doublet
======================== HMEC-orig-MULTI ========================
Table of consensus calls:
doublet singlet
247 26179
Confusion Matrix and Statistics
Consensus prediction
Truth doublet singlet
doublet 204 3364
singlet 43 22815
Accuracy : 0.8711
95% CI : (0.867, 0.8751)
No Information Rate : 0.9907
P-Value [Acc > NIR] : 1
Kappa : 0.0911
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.825911
Specificity : 0.871500
Pos Pred Value : 0.057175
Neg Pred Value : 0.998119
Prevalence : 0.009347
Detection Rate : 0.007720
Detection Prevalence : 0.135019
Balanced Accuracy : 0.848705
'Positive' Class : doublet
======================== pbmc-1B-dm ========================
Table of consensus calls:
doublet singlet
13 3777
Confusion Matrix and Statistics
Consensus prediction
Truth doublet singlet
doublet 8 122
singlet 5 3655
Accuracy : 0.9665
95% CI : (0.9603, 0.972)
No Information Rate : 0.9966
P-Value [Acc > NIR] : 1
Kappa : 0.1063
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.615385
Specificity : 0.967699
Pos Pred Value : 0.061538
Neg Pred Value : 0.998634
Prevalence : 0.003430
Detection Rate : 0.002111
Detection Prevalence : 0.034301
Balanced Accuracy : 0.791542
'Positive' Class : doublet
======================== pdx-MULTI ========================
Table of consensus calls:
doublet singlet
4 10292
Confusion Matrix and Statistics
Consensus prediction
Truth doublet singlet
doublet 3 1314
singlet 1 8978
Accuracy : 0.8723
95% CI : (0.8657, 0.8787)
No Information Rate : 0.9996
P-Value [Acc > NIR] : 1
Kappa : 0.0038
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.7500000
Specificity : 0.8723280
Pos Pred Value : 0.0022779
Neg Pred Value : 0.9998886
Prevalence : 0.0003885
Detection Rate : 0.0002914
Detection Prevalence : 0.1279138
Balanced Accuracy : 0.8111640
'Positive' Class : doublet
Overall, methods do not have substantial overlap with one another. They each tend to detect different sets of doublets, leading to fairly small sets of consensus doublets. Further, the consensus doublets called by all three methods have some, but not substantial, overlap with the ground truth.
For three out of four datasets, scDblFinder predicts a
much larger number of doublets compared to other methods. For
pdx-MULTI, however, cxds predicts a much
larger number of doublets.
The table below summarizes performance of the “consensus caller”.
Note that, in the benchmarking paper
these datasets were originally analyzed in, hm-6k was
observed to be one of the “easiest” datasets to classify across
methods.
Consistent with that observation, it has the highest kappa
value here, although it is still fairly low — though not as low as the
other datasets, whose kappa values are very close to 0.
metric_df
# record the versions of the packages used in this analysis and other
# environment information, for reproducibility of these results
sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.5
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] stats4 stats graphics grDevices datasets utils methods
[8] base
other attached packages:
[1] UpSetR_1.4.0 caret_6.0-94
[3] lattice_0.22-6 patchwork_1.2.0
[5] ggplot2_3.5.1 SingleCellExperiment_1.26.0
[7] SummarizedExperiment_1.34.0 Biobase_2.64.0
[9] GenomicRanges_1.56.0 GenomeInfoDb_1.40.0
[11] IRanges_2.38.0 S4Vectors_0.42.0
[13] BiocGenerics_0.50.0 MatrixGenerics_1.16.0
[15] matrixStats_1.3.0
loaded via a namespace (and not attached):
[1] pROC_1.18.5 gridExtra_2.3
[3] rlang_1.1.3 magrittr_2.0.3
[5] e1071_1.7-14 compiler_4.4.0
[7] DelayedMatrixStats_1.26.0 vctrs_0.6.5
[9] reshape2_1.4.4 stringr_1.5.1
[11] pkgconfig_2.0.3 crayon_1.5.2
[13] fastmap_1.1.1 backports_1.4.1
[15] XVector_0.44.0 labeling_0.4.3
[17] scuttle_1.14.0 utf8_1.2.4
[19] rmarkdown_2.26 tzdb_0.4.0
[21] prodlim_2023.08.28 UCSC.utils_1.0.0
[23] bit_4.0.5 purrr_1.0.2
[25] xfun_0.43 beachmat_2.20.0
[27] zlibbioc_1.50.0 cachem_1.0.8
[29] jsonlite_1.8.8 recipes_1.0.10
[31] highr_0.10 DelayedArray_0.30.1
[33] BiocParallel_1.38.0 broom_1.0.5
[35] parallel_4.4.0 R6_2.5.1
[37] bslib_0.7.0 stringi_1.8.4
[39] car_3.1-2 parallelly_1.37.1
[41] rpart_4.1.23 lubridate_1.9.3
[43] jquerylib_0.1.4 Rcpp_1.0.12
[45] iterators_1.0.14 knitr_1.46
[47] future.apply_1.11.2 readr_2.1.5
[49] Matrix_1.7-0 splines_4.4.0
[51] nnet_7.3-19 timechange_0.3.0
[53] tidyselect_1.2.1 abind_1.4-5
[55] yaml_2.3.8 timeDate_4032.109
[57] codetools_0.2-20 listenv_0.9.1
[59] tibble_3.2.1 plyr_1.8.9
[61] withr_3.0.0 evaluate_0.23
[63] future_1.33.2 survival_3.5-8
[65] proxy_0.4-27 ggpubr_0.6.0
[67] pillar_1.9.0 BiocManager_1.30.23
[69] carData_3.0-5 renv_1.0.7
[71] foreach_1.5.2 generics_0.1.3
[73] vroom_1.6.5 rprojroot_2.0.4
[75] hms_1.1.3 sparseMatrixStats_1.16.0
[77] munsell_0.5.1 scales_1.3.0
[79] globals_0.16.3 class_7.3-22
[81] glue_1.7.0 tools_4.4.0
[83] data.table_1.15.4 ggsignif_0.6.4
[85] ModelMetrics_1.2.2.2 gower_1.0.1
[87] cowplot_1.1.3 grid_4.4.0
[89] tidyr_1.3.1 ipred_0.9-14
[91] colorspace_2.1-0 nlme_3.1-164
[93] GenomeInfoDbData_1.2.12 cli_3.6.2
[95] fansi_1.0.6 S4Arrays_1.4.0
[97] lava_1.8.0 dplyr_1.1.4
[99] gtable_0.3.5 rstatix_0.7.2
[101] sass_0.4.9 digest_0.6.35
[103] SparseArray_1.4.3 farver_2.1.1
[105] htmltools_0.5.8.1 lifecycle_1.0.4
[107] hardhat_1.3.1 httr_1.4.7
[109] bit64_4.0.5 MASS_7.3-60.2